The data flow for this project requires several files:

getting a list of doi, for which there is doi_extract.py which runs on a wikipedia data dump using the mediawiki-utilities package by halfak
getting the page views. for each page in th doi_list.txt we need to get the page views. I have done this in a janky way, but it is outlined in the Get pageviews ipython notebook.



In [1]:

    
import json
import re
import operator
from collections import defaultdict
import pandas as pd



In [2]:

    
ls









    



cite doi analysis.ipynb     page_views_all.json
doi_list.txt                page_views_errors.json
doi_page_titles.json        page_views.json
doi_page_views.json         page_views.json~
dumpfile.html               README.md
Finding DOIs example.ipynb  research questions.txt
Get Pageviews.ipynb         testfile.compressed.1000meg
LICENSE



In [3]:

    
doi_list = open('doi_list.txt')



In [4]:

    
doi_lines = doi_list.readlines()



In [5]:

    
len(doi_lines)









    Out[5]:





27182



In [6]:

    
page_dois = defaultdict(list)
doi_pages = defaultdict(list)
prefixes = defaultdict(int)



In [7]:

    
for line in doi_lines:
    parts = re.split(r'\t|\n', line)
    page_title = parts[0]
    doi = parts[1].strip()
    #if len(doi) != len(parts[1]):
    #    print(parts[1], doi)
    if doi and (doi.lower() != 'noedit'):
        page_dois[page_title].append(doi)
        doi_pages[doi].append(page_title)
        prefix = doi.split('/')[0]
        prefixes[prefix] += 1



In [8]:

    
for doi, pages in doi_pages.iteritems():
    if doi.startswith(' '):
        print(doi, doi.strip())



In [9]:

    
num_page_dois = {page: len(dois) for page, dois in page_dois.iteritems()}
num_doi_pages = {doi: len(pages) for doi, pages in doi_pages.iteritems()}



In [10]:

    
npd = pd.DataFrame.from_dict(data=num_page_dois, orient='index')
ndp = pd.DataFrame.from_dict(data=num_doi_pages, orient='index')
prefixdf = pd.DataFrame.from_dict(data=prefixes, orient='index')



In [11]:

    
npdc = npd.convert_objects(convert_numeric=True)
ndpc = ndp.convert_objects(convert_numeric=True)
prefixdfc = prefixdf.convert_objects(convert_numeric=True)



In [30]:

    
npdc.sort([0], ascending=False).head(20)









    Out[30]:






  
    
      
      0
    
  
  
    
      Induced stem cells
       189
    
    
      List of Ig Nobel Prize winners
        91
    
    
      Asymmetric hydrogenation
        86
    
    
      Spinal muscular atrophy
        84
    
    
      Crystallographic defects in diamond
        80
    
    
      Fluorine
        78
    
    
      Fullerene chemistry
        54
    
    
      Choosing Wisely
        50
    
    
      Woolly mammoth
        47
    
    
      Health effects of tobacco
        43
    
    
      Management of schizophrenia
        40
    
    
      2-Norbornyl cation
        39
    
    
      Wolff–Kishner reduction
        39
    
    
      Hunterian Society
        37
    
    
      Nanogenerator
        37
    
    
      Fluid queue
        36
    
    
      American White Ibis
        35
    
    
      Phenols
        33
    
    
      Moroccan genetics
        32
    
    
      Assisted colonization
        32
    
  

20 rows × 1 columns



In [12]:

    
print npdc.sort([0], ascending=False).head(10).to_html(justify='left')









    



<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: left;">
      <th></th>
      <th>0</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>Induced stem cells</th>
      <td> 189</td>
    </tr>
    <tr>
      <th>List of Ig Nobel Prize winners</th>
      <td>  91</td>
    </tr>
    <tr>
      <th>Asymmetric hydrogenation</th>
      <td>  86</td>
    </tr>
    <tr>
      <th>Spinal muscular atrophy</th>
      <td>  84</td>
    </tr>
    <tr>
      <th>Crystallographic defects in diamond</th>
      <td>  80</td>
    </tr>
    <tr>
      <th>Fluorine</th>
      <td>  78</td>
    </tr>
    <tr>
      <th>Fullerene chemistry</th>
      <td>  54</td>
    </tr>
    <tr>
      <th>Choosing Wisely</th>
      <td>  50</td>
    </tr>
    <tr>
      <th>Woolly mammoth</th>
      <td>  47</td>
    </tr>
    <tr>
      <th>Health effects of tobacco</th>
      <td>  43</td>
    </tr>
  </tbody>
</table>



In [13]:

    
print ndpc.sort([0], ascending=False).head(11).to_html(justify='left')









    



<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: left;">
      <th></th>
      <th>0</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>10.1128/MCB.22.19.6663-6668.2002</th>
      <td> 171</td>
    </tr>
    <tr>
      <th>10.1093/icb/icr006</th>
      <td> 132</td>
    </tr>
    <tr>
      <th>10.1093/nar/gkj002</th>
      <td>  75</td>
    </tr>
    <tr>
      <th>10.1128/MCB.24.13.5797-5807.2004</th>
      <td>  66</td>
    </tr>
    <tr>
      <th>10.1088.2F0004-6256.2F141.2F5.2F170</th>
      <td>  64</td>
    </tr>
    <tr>
      <th>10.1093/emboj/20.11.2943</th>
      <td>  50</td>
    </tr>
    <tr>
      <th>10.1088/0004-637X/753/2/156</th>
      <td>  38</td>
    </tr>
    <tr>
      <th>10.1088/0067-0049/197/2/19</th>
      <td>  33</td>
    </tr>
    <tr>
      <th>accessdate = 2012-10-25</th>
      <td>  30</td>
    </tr>
    <tr>
      <th>10.3897/zookeys.242.3856</th>
      <td>  28</td>
    </tr>
    <tr>
      <th>10.1073/pnas.242603899</th>
      <td>  27</td>
    </tr>
  </tbody>
</table>



In [14]:

    
print ndpc.sort([0], ascending=False).head(11).index









    



Index([u'10.1128/MCB.22.19.6663-6668.2002', u'10.1093/icb/icr006', u'10.1093/nar/gkj002', u'10.1128/MCB.24.13.5797-5807.2004', u'10.1088.2F0004-6256.2F141.2F5.2F170', u'10.1093/emboj/20.11.2943', u'10.1088/0004-637X/753/2/156', u'10.1088/0067-0049/197/2/19', u'accessdate = 2012-10-25', u'10.3897/zookeys.242.3856', u'10.1073/pnas.242603899'], dtype='object')



In [15]:

    
print prefixdf.sort([0], ascending=False).head(10).to_html()









    



<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>10.1016</th>
      <td> 3398</td>
    </tr>
    <tr>
      <th>10.1038</th>
      <td> 1879</td>
    </tr>
    <tr>
      <th>10.1007</th>
      <td> 1793</td>
    </tr>
    <tr>
      <th>10.1098</th>
      <td> 1716</td>
    </tr>
    <tr>
      <th>10.1111</th>
      <td> 1350</td>
    </tr>
    <tr>
      <th>10.1093</th>
      <td> 1203</td>
    </tr>
    <tr>
      <th>10.1002</th>
      <td>  960</td>
    </tr>
    <tr>
      <th>10.1021</th>
      <td>  873</td>
    </tr>
    <tr>
      <th>10.1126</th>
      <td>  821</td>
    </tr>
    <tr>
      <th>10.1080</th>
      <td>  704</td>
    </tr>
  </tbody>
</table>



In [16]:

    
page_views_list = json.load(open('page_views_all.json', 'r'))
page_views = {i[0]: i[1] for i in page_views_list}

def total_page_views(page_list):
    view_count = 0
    for page in page_list:
        try:
            view_count += page_views[page.decode('utf-8')]
        except:
            pass
            #print(page)
    return view_count

doi_views = {doi: total_page_views(page_list) for doi, page_list in doi_pages.iteritems() }
viewsdf = pd.DataFrame.from_dict(data=doi_views, orient='index')



In [18]:

    
print viewsdf.sort([0], ascending=False).head(10).to_html()









    



<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>10.1038/460787a</th>
      <td> 2767151</td>
    </tr>
    <tr>
      <th>10.1038/462545a</th>
      <td> 2766979</td>
    </tr>
    <tr>
      <th>10.1353/cwh.1969.0065</th>
      <td> 1382286</td>
    </tr>
    <tr>
      <th>10.1145/1284621.1284635</th>
      <td> 1313264</td>
    </tr>
    <tr>
      <th>10.1371/journal.pone.0028705</th>
      <td> 1216279</td>
    </tr>
    <tr>
      <th>10.1126/science.1173983</th>
      <td>  704326</td>
    </tr>
    <tr>
      <th>10.1038/nature06949</th>
      <td>  646251</td>
    </tr>
    <tr>
      <th>10.1073/pnas.0805721105</th>
      <td>  646251</td>
    </tr>
    <tr>
      <th>10.1080/10807030802387556</th>
      <td>  624108</td>
    </tr>
    <tr>
      <th>10.1098/rsbm.1955.0005</th>
      <td>  544353</td>
    </tr>
  </tbody>
</table>



In [23]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [29]:

    
npd.describe()









    Out[29]:






  
    
      
      0
    
  
  
    
      count
       11101.000000
    
    
      mean
           2.319791
    
    
      std
           3.955982
    
    
      min
           1.000000
    
    
      25%
           1.000000
    
    
      50%
           1.000000
    
    
      75%
           2.000000
    
    
      max
         189.000000
    
  

8 rows × 1 columns



In [42]:

    
len(npdc[npdc[0] == 1]) / float(len(npdc))









    Out[42]:





0.605981443113233



In [35]:

    
p = npd.hist(bins=40, log=True)









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-35-ff2de86e1116> in <module>()
      1 p = npd.hist(bins=40, log=True)
----> 2 p.title('DOI citations among Wikipedia articles')

AttributeError: 'numpy.ndarray' object has no attribute 'title'



In [ ]:

	0
Induced stem cells	189
List of Ig Nobel Prize winners	91
Asymmetric hydrogenation	86
Spinal muscular atrophy	84
Crystallographic defects in diamond	80
Fluorine	78
Fullerene chemistry	54
Choosing Wisely	50
Woolly mammoth	47
Health effects of tobacco	43
Management of schizophrenia	40
2-Norbornyl cation	39
Wolff–Kishner reduction	39
Hunterian Society	37
Nanogenerator	37
Fluid queue	36
American White Ibis	35
Phenols	33
Moroccan genetics	32
Assisted colonization	32

	0
count	11101.000000
mean	2.319791
std	3.955982
min	1.000000
25%	1.000000
50%	1.000000
75%	2.000000
max	189.000000